# Computations
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from collections import Counter
# Visualisation libraries
## progressbar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
##
import missingno as msno
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib import cm
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we analyze a dataset of Netflix TV shows and movies that is available on kaggle.com.
This dataset consists of TV shows and movies available on Netflix as of 2019. The dataset was collected from Flixable, a third-party Netflix search engine.
In 2018, Flixable released an interesting report showing that the number of TV shows on Netflix has nearly tripled since 2010. The streaming service's number of movies has decreased by more than 2,000 titles since 2010, while its number of TV shows has nearly tripled. It will be interesting to explore what other insights can be obtained from the same dataset.
Integrating this dataset with external datasets such as IMDb ratings or Rotten Tomatoes scores can also provide many interesting findings.
Some of the interesting questions (tasks) that can be explored with this dataset:
- Understanding what content is available in different countries.
- Identifying similar content by matching text-based features.
- Network analysis of actors / directors to find interesting insights.
- Has Netflix been increasingly focusing on TV shows rather than movies in recent years?
# Load the Netflix catalogue from disk and preview its first five rows.
Data = pd.read_csv(filepath_or_buffer="Data/netflix_titles.csv")
Data.head(n=5)
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 81145628 | Movie | Norm of the North: King Sized Adventure | Richard Finn, Tim Maltby | Alan Marriott, Andrew Toth, Brian Dobson, Cole... | United States, India, South Korea, China | September 9, 2019 | 2019 | TV-PG | 90 min | Children & Family Movies, Comedies | Before planning an awesome wedding for his gra... |
| 1 | 80117401 | Movie | Jandino: Whatever it Takes | NaN | Jandino Asporaat | United Kingdom | September 9, 2016 | 2016 | TV-MA | 94 min | Stand-Up Comedy | Jandino Asporaat riffs on the challenges of ra... |
| 2 | 70234439 | TV Show | Transformers Prime | NaN | Peter Cullen, Sumalee Montano, Frank Welker, J... | United States | September 8, 2018 | 2013 | TV-Y7-FV | 1 Season | Kids' TV | With the help of three human allies, the Autob... |
| 3 | 80058654 | TV Show | Transformers: Robots in Disguise | NaN | Will Friedle, Darren Criss, Constance Zimmer, ... | United States | September 8, 2018 | 2016 | TV-Y7 | 1 Season | Kids' TV | When a prison ship crash unleashes hundreds of... |
| 4 | 80125979 | Movie | #realityhigh | Fernando Lebrija | Nesta Cooper, Kate Walsh, John Michael Higgins... | United States | September 8, 2017 | 2017 | TV-14 | 99 min | Comedies | When nerdy high schooler Dani finally attracts... |
First off, let's take a look at NaN values.
def Data_info(Inp, Only_NaN = False):
    """Summarise the data type and completeness of every column of a DataFrame.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to inspect.
    Only_NaN : bool, default False
        When True, keep only the columns that contain at least one missing value.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``Inp`` (index named ``'Features'``, ordered by
        column name when the labels are sortable) with the columns:

        * ``'Data Type'`` - the column dtype as a string,
        * ``'Number of NaN Values'`` - count of missing entries,
        * ``'Size'`` - number of rows in ``Inp``,
        * ``'Percentage'`` - share of NON-missing entries, in percent
          (NaN when ``Inp`` has no rows).
    """
    n_rows = Inp.shape[0]

    # Convert dtypes to strings up front: raw dtype objects have no meaningful
    # total order, so they should never be used as a sort key.
    Out = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Out['Number of NaN Values'] = Inp.isnull().sum()
    Out['Size'] = n_rows
    Out['Percentage'] = 100 - np.round(100*(Out['Number of NaN Values']/n_rows), 2)
    Out.index.name = 'Features'

    try:
        # Deterministic, alphabetical ordering of the features.
        Out = Out.sort_index()
    except TypeError:
        # Mixed-type column labels cannot be ordered; keep the frame's column order.
        pass

    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values'] > 0]
    return Out
# Interactive bar chart of the per-column 'Percentage' reported by Data_info,
# coloured and labelled by each column's data type.
data_info = Data_info(Data).reset_index(drop=False)
fig = px.bar(
    data_info,
    x='Features',
    y='Percentage',
    color='Data Type',
    text='Data Type',
    hover_data=data_info.columns,
    color_discrete_sequence=['PaleGreen', 'LightBlue', 'PeachPuff'],
)
fig.update_layout(
    plot_bgcolor='white',
    width=700,
    legend=dict(x=1, y=.5, traceorder="normal", bordercolor="DarkGray", borderwidth=1),
)
# Text template (padded with six leading spaces) and bar outline styling in one call.
fig.update_traces(
    texttemplate=6*' ' + '%{label}',
    textposition='inside',
    marker_line_color='Black',
    marker_line_width=1.,
    opacity=1,
)
fig.show()
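Moreover, the dendrogram below groups together the variables whose missing-value patterns are correlated, i.e., columns that tend to be filled or empty in the same rows.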
# missingno dendrogram of the dataset; the return value is discarded to keep the cell output clean.
_ = msno.dendrogram(
    Data,
    figsize=(16, 6),
    fontsize=14,
)
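We replace all NaN values in the date_added column with the placeholder date January 01, 2020. Rows carrying this placeholder are excluded from the yearly analysis below, which only keeps year_added < 2020.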
# Fill missing 'date_added' entries with a placeholder date, then parse the column as datetimes.
Data['date_added'] = (
    Data['date_added']
    .fillna('January 01, 2020')
    .pipe(pd.to_datetime)
)
Adding features
# Derive calendar features from the parsed 'date_added' timestamps. The vectorised
# .dt accessor replaces a per-row Python lambda; the explicit astype(int) keeps the
# integer dtype produced by the original apply(...).astype(int).
Data['year_added'] = Data['date_added'].dt.year.astype(int)
Data['month_added'] = Data['date_added'].dt.month.astype(int)
Data['month_name'] = Data['date_added'].dt.month_name()
# Number of titles added per (type, year), excluding the 2020 placeholder year.
# size() counts rows per group; 'year_added' is never missing, so this equals a count of it.
Group = (
    Data.loc[Data.year_added < 2020]
    .groupby(['type', 'year_added'])
    .size()
    .reset_index(name='count')
    # Sort on both keys so the Movie / TV Show order within a year is deterministic.
    .sort_values(by=['year_added', 'type'])
    .reset_index(drop=True)
)
# Total titles added in each year (all types), broadcast back onto every row of that year.
Group['total'] = Group.groupby('year_added')['count'].transform('sum')
# Share of the year's additions that belongs to this type, in percent.
Group['Percentage'] = 100*(Group['count']/Group['total'])
Group = Group.round(2)
display(Group)
| type | year_added | count | total | Percentage | |
|---|---|---|---|---|---|
| 0 | Movie | 2008 | 1 | 2 | 50.00 |
| 1 | TV Show | 2008 | 1 | 2 | 50.00 |
| 2 | Movie | 2009 | 2 | 2 | 100.00 |
| 3 | Movie | 2010 | 1 | 1 | 100.00 |
| 4 | Movie | 2011 | 13 | 13 | 100.00 |
| 5 | Movie | 2012 | 4 | 7 | 57.14 |
| 6 | TV Show | 2012 | 3 | 7 | 42.86 |
| 7 | Movie | 2013 | 6 | 12 | 50.00 |
| 8 | TV Show | 2013 | 6 | 12 | 50.00 |
| 9 | Movie | 2014 | 19 | 25 | 76.00 |
| 10 | TV Show | 2014 | 6 | 25 | 24.00 |
| 11 | TV Show | 2015 | 32 | 90 | 35.56 |
| 12 | Movie | 2015 | 58 | 90 | 64.44 |
| 13 | TV Show | 2016 | 192 | 456 | 42.11 |
| 14 | Movie | 2016 | 264 | 456 | 57.89 |
| 15 | TV Show | 2017 | 387 | 1300 | 29.77 |
| 16 | Movie | 2017 | 913 | 1300 | 70.23 |
| 17 | Movie | 2018 | 1290 | 1782 | 72.39 |
| 18 | TV Show | 2018 | 492 | 1782 | 27.61 |
| 19 | Movie | 2019 | 1546 | 2349 | 65.82 |
| 20 | TV Show | 2019 | 803 | 2349 | 34.18 |
# Two stacked panels: yearly counts as lines (top) and the per-type share as bars (bottom).
fig, ax = plt.subplots(2, 1, figsize=(14, 12))

# --- Top panel: one line per (rows, column) combination -------------------------------
movie_rows = Group.loc[Group.type == 'Movie']
show_rows = Group.loc[Group.type == 'TV Show']
line_specs = [
    (movie_rows, 'total', '#34495e', 'Total'),
    (movie_rows, 'count', '#e74c3c', 'Movies'),
    (show_rows, 'count', '#9b59b6', 'TV Shows'),
]
for rows, column, colour, legend_label in line_specs:
    _ = rows.plot(ax=ax[0], x='year_added', y=column, color=colour, linewidth=2,
                  label=legend_label)

first_year = Group.year_added.min()
last_year = Group.year_added.max()
_ = ax[0].set_ylim([0, 2500])
_ = ax[0].set_xlim([first_year, last_year])
_ = ax[0].set_xticks(np.arange(first_year, last_year + 1, 1))
_ = ax[0].set_xlabel('Year Added')
_ = ax[0].set_ylabel('Count')
_ = ax[0].legend(bbox_to_anchor=(1, 1), fontsize=14)

# --- Bottom panel: percentage of each year's additions by type ------------------------
_ = sns.barplot(ax=ax[1], data=Group, x="year_added", y="Percentage", hue='type',
                palette='Purples', edgecolor='k', hatch="///")
_ = ax[1].legend(bbox_to_anchor=(1, 1), fontsize=14)
_ = ax[1].set_ylim([0, 100])
_ = ax[1].set_yticks(np.arange(0, 101, 10))
_ = ax[1].set_xlabel('Year Added')
_ = ax[1].set_ylabel('Percentage')
# Count titles per (year, month) before 2020, then pivot so rows are months and columns are years.
per_year_month = Data.loc[Data.year_added<2020].groupby(['year_added', 'month_added'])['year_added'].agg({'count'})
Group = per_year_month.unstack().fillna(0).T

# Month names ordered by month number, used as the row labels of the heatmap.
Temp = Data[['month_added','month_name']].sort_values(by=['month_added'])['month_name'].unique().tolist()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 8))
cs = ax.pcolor(Group, cmap='BuGn', edgecolors='white', linewidths=2)

# Ticks sit at the centre of each cell (offset 0.5).
n_cols = len(Group.columns)
n_rows = len(Group.index)
_ = ax.set_xticks(np.arange(0.5, n_cols, 1))
_ = ax.set_xticklabels(Group.columns, fontsize=14)
_ = ax.set_yticks(np.arange(0.5, n_rows, 1))
_ = ax.set_yticklabels(Temp, fontsize=14)
_ = ax.set_title('Contents Added/Updated by Months', fontsize=14)
_ = fig.colorbar(cs, ax=ax, shrink=1)
Comparing this data with Netflix timeline data, we can get some explanations behind this growth.
We can define the following function to separate the values in each column.
def Col_Sep(Col, df = None):
    """Count the individual values stored in a comma-separated text column.

    Parameters
    ----------
    Col : str
        Name of the column whose cells hold comma-separated values.
    df : pandas.DataFrame, optional
        Frame to read from. When omitted, the module-level ``Data`` frame is
        used; it is looked up at call time so a reloaded ``Data`` is honoured.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'index'`` (the value) and ``0`` (its number of
        occurrences), ordered from most to least frequent. Ties keep
        first-appearance order. The frame is empty when the column has no values.
    """
    if df is None:
        df = Data

    # Split on commas and strip whitespace so that stray spaces or trailing commas
    # do not create separate, near-duplicate entries; empty fragments are ignored.
    counts = Counter(
        item
        for cell in df[Col].dropna()
        for item in (part.strip() for part in cell.split(','))
        if item
    )
    return pd.DataFrame(counts.most_common(), columns=['index', 0])
# Number of countries shown in both panels; defined first so the bar chart and the
# pie chart always use the same cut-off.
Top = 10
Group = Col_Sep('country')
Group.columns = ['Country','Count']
top_countries = Group[:Top]

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(13, 6))

# Left panel: horizontal bars of the most frequent countries.
_ = sns.barplot(ax = ax[0], y="Country", x="Count", palette='summer',
                edgecolor='k', hatch="///", data = top_countries)
_ = ax[0].set_xlim([0, 3e3])

# Right panel: donut chart of the same rows. The explode list is sized from the rows
# actually present (there may be fewer than Top) and only the first wedge is pulled out.
E = [0] * len(top_countries)
if E:
    E[0] = 0.05
_ = top_countries.plot.pie(ax = ax[1], y= 'Count', startangle=90, label = '', labels = None,
                           colors = sns.color_palette("Paired"), legend=True, autopct='%1.1f%%',
                           fontsize=10, pctdistance=0.85, explode = E)
_ = ax[1].legend(bbox_to_anchor=(1, 1), labels= top_countries.Country)
# A white disc over the centre turns the pie into a donut.
_ = ax[1].add_artist(plt.Circle((0,0),0.70,fc='white'))
_ = ax[1].set_xlabel('Top %i Content Producers' % Top, fontsize = 14)
Netflix uses the following table to decide the maturity rating of TV shows and movies [2].
| Little Kids | Older Kids | Teens | Mature |
|---|---|---|---|
| G, TV-Y, TV-G | PG, TV-Y7, TV-Y7-FV, TV-PG | PG-13, TV-14 | R, NC-17, TV-MA |
Moreover, the film ratings of the Motion Picture Association of America (MPAA) rating system are as follows:
| Rating | Meaning |
|---|---|
| G | General Audiences |
| PG | Parental Guidance Suggested |
| PG-13 | Parents Strongly Cautioned |
| R | Restricted |
| NC-17 | Adults Only |
| Unrated | Unrated |
# Count titles per (type, rating).
# NOTE(review): dropna() without `subset` discards every title that has a missing value
# in ANY column (e.g. director), so these counts only cover fully populated rows.
# Confirm this is intended before comparing them with the per-year totals.
Group = Data.dropna().groupby(['type','rating'])['rating'].agg(['count'])
Group.reset_index(drop = False, inplace = True)
print('Movie Ratings:')

# Maturity group of every known rating; ratings not listed here are left as NaN.
rating_groups = {
    'Little Kids': ['G', 'TV-Y', 'TV-G'],
    'Older Kids': ['PG', 'TV-Y7', 'TV-Y7-FV', 'TV-PG'],
    'Teens': ['PG-13', 'TV-14'],
    'Mature': ['R', 'NC-17', 'TV-MA'],
    'Unrated': ['NR', 'UR'],
}
rating_to_group = {rating: group
                   for group, ratings in rating_groups.items()
                   for rating in ratings}
# A single map() builds the column with the right dtype, instead of creating a float
# NaN column and then writing strings into it.
Group['rating group'] = Group['rating'].map(rating_to_group)

# Stable sort: rows inside a rating group keep the (type, rating) order of the groupby.
Group.sort_values(by=['rating group'], kind='mergesort', inplace = True)

# Styler.hide_index() was replaced by Styler.hide(axis='index'); support both.
styler = Group.style
display(styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index())
Movie Ratings:
| type | rating | count | rating group |
|---|---|---|---|
| Movie | G | 35 | Little Kids |
| TV Show | TV-G | 1 | Little Kids |
| TV Show | TV-Y | 3 | Little Kids |
| Movie | TV-G | 53 | Little Kids |
| Movie | TV-Y | 21 | Little Kids |
| Movie | R | 500 | Mature |
| Movie | TV-MA | 1157 | Mature |
| TV Show | R | 1 | Mature |
| Movie | NC-17 | 1 | Mature |
| TV Show | TV-MA | 32 | Mature |
| Movie | PG | 176 | Older Kids |
| Movie | TV-PG | 344 | Older Kids |
| Movie | TV-Y7 | 42 | Older Kids |
| Movie | TV-Y7-FV | 11 | Older Kids |
| TV Show | TV-PG | 14 | Older Kids |
| TV Show | TV-Y7 | 6 | Older Kids |
| Movie | TV-14 | 878 | Teens |
| TV Show | TV-14 | 39 | Teens |
| Movie | PG-13 | 278 | Teens |
| Movie | NR | 175 | Unrated |
| Movie | UR | 7 | Unrated |
Therefore,
# One bar-chart panel per maturity group, movies only.
movie_ratings = Group.loc[Group.type == 'Movie']
# Ratings without a maturity group (NaN) cannot be matched by equality, so skip them
# rather than drawing an empty panel.
RG = movie_ratings['rating group'].dropna().unique()

# squeeze=False always yields a 2-D axes array, so indexing also works for a single group.
fig, ax = plt.subplots(nrows=1, ncols=len(RG), figsize=(13, 6), sharey = True, squeeze=False)
ax = ax[0]
C = ['forestgreen','mediumseagreen','bisque','coral','gray']
for i, (axis, group_name) in enumerate(zip(ax, RG)):
    Temp = movie_ratings.loc[movie_ratings['rating group'] == group_name]
    # Cycle through the colours so extra groups never index past the end of the list.
    _ = sns.barplot(ax=axis, x='rating', y ='count', data=Temp, linewidth=2.5,
                    facecolor=C[i % len(C)], errcolor=".3", edgecolor=".3", hatch = '//')
    _ = axis.set_xlabel(None)
    _ = axis.set_ylabel(None)
    _ = axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
# The y axis is shared, so limits and label only need to be set on the first panel.
_ = ax[0].set_ylim([0,1400])
_ = ax[0].set_ylabel('Count')
st = fig.suptitle("Netflix Maturity Rating (Movies)", fontsize="x-large")
st.set_y(0)
del RG, C, Temp, movie_ratings
fig.tight_layout()
# Show only the TV Show rows of the rating table.
Group[Group['type'] == 'TV Show']
| type | rating | count | rating group | |
|---|---|---|---|---|
| 16 | TV Show | TV-G | 1 | Little Kids |
| 19 | TV Show | TV-Y | 3 | Little Kids |
| 14 | TV Show | R | 1 | Mature |
| 17 | TV Show | TV-MA | 32 | Mature |
| 18 | TV Show | TV-PG | 14 | Older Kids |
| 20 | TV Show | TV-Y7 | 6 | Older Kids |
| 15 | TV Show | TV-14 | 39 | Teens |
# One bar-chart panel per maturity group, TV shows only.
tv_ratings = Group.loc[Group.type == 'TV Show']
# Ratings without a maturity group (NaN) cannot be matched by equality, so skip them
# rather than drawing an empty panel.
RG = tv_ratings['rating group'].dropna().unique()

# squeeze=False always yields a 2-D axes array, so indexing also works for a single group.
fig, ax = plt.subplots(nrows=1, ncols=len(RG), figsize=(10, 6), sharey = True, squeeze=False)
ax = ax[0]
C = ['forestgreen','mediumseagreen','bisque','coral','gray']
for i, (axis, group_name) in enumerate(zip(ax, RG)):
    Temp = tv_ratings.loc[tv_ratings['rating group'] == group_name]
    # Cycle through the colours so extra groups never index past the end of the list.
    _ = sns.barplot(ax=axis, x='rating', y ='count', data=Temp, linewidth=2.5,
                    facecolor=C[i % len(C)], errcolor=".3", edgecolor=".3", hatch = '//')
    _ = axis.set_xlabel(None)
    _ = axis.set_ylabel(None)
    _ = axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
# The y axis is shared, so limits and label only need to be set on the first panel.
_ = ax[0].set_ylim([0,50])
_ = ax[0].set_ylabel('Count')
st = fig.suptitle("Netflix Maturity Rating (TV Shows)", fontsize="x-large")
st.set_y(0)
del RG, C, Temp, tv_ratings
fig.tight_layout()
# Donut chart of the total number of titles in each rating group.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6), sharey = True)
Temp = Group.groupby(['rating group'])['count'].agg({'sum'})

# Every wedge is pulled out by the same small offset.
E = [0.06] * len(Temp)
pie_options = dict(
    startangle=90,
    label='',
    labels=None,
    colors=sns.color_palette("Paired"),
    legend=True,
    autopct='%1.1f%%',
    fontsize=12,
    pctdistance=0.85,
    explode=E,
    wedgeprops={"linewidth": 1.5, "edgecolor": 'gray'},
)
_ = Temp.plot.pie(ax=ax, y='sum', **pie_options)
_ = ax.legend(bbox_to_anchor=(2, 0.7), loc="center left", labels=Temp.index, fontsize=14)
# A white disc over the centre turns the pie into a donut.
_ = ax.add_artist(plt.Circle((0,0),0.70,fc='white'))
_ = ax.set_title('Netflix Rating Groups')
# Bar chart of how many titles are listed under each category.
Group = Col_Sep('listed_in')
Group.columns = ['Category','Count']

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6), sharey = True)
_ = sns.barplot(
    ax=ax,
    data=Group,
    x="Category",
    y="Count",
    palette='RdPu',
    edgecolor='k',
    hatch="///",
)
# Category names are long, so rotate the tick labels to keep them readable.
category_labels = ax.get_xticklabels()
_ = ax.set_xticklabels(category_labels, rotation=90, fontsize=12)
_ = ax.set_ylim([0,2e3])
# Most frequent movie actors for each of the five countries with the most movies.
Top = 10
movie_titles = Data.loc[Data.type == 'Movie']

# The five countries that appear most often among movies.
Group = Col_Sep('country', movie_titles)
Group.columns = ['Country','Count']
Temp = Group[:5]['Country'].tolist()
del Group

# For every country, count cast members over the movies whose country field contains it.
# na=False treats titles without a country as non-matching.
# NOTE(review): this is a plain substring match, so a country whose name is contained in
# another country's name would also match that other country.
per_country = [
    Col_Sep('cast', df=movie_titles.loc[movie_titles['country'].str.contains(country, regex=False, na=False)])
    .head(Top)
    .assign(Country=country)
    for country in Temp
]
Group = pd.concat(per_country)
Group.columns = ['Actor', 'Count', 'Country']
del per_country, movie_titles

# squeeze=False always yields a 2-D axes array, so indexing also works for a single country.
fig, ax = plt.subplots(nrows=1, ncols=len(Temp), figsize=(16, 6), sharey = True, squeeze=False)
ax = ax[0]
C = ['Blues','OrRd','winter','spring','summer']
for axis, country, palette in zip(ax, Temp, C):
    Temp0 = Group.loc[Group['Country'] == country]
    _ = sns.barplot(ax=axis, x='Actor', y ='Count', data=Temp0, palette= palette, edgecolor='k', hatch="///")
    _ = axis.set_xlabel(country)
    _ = axis.set_ylabel(None)
    _ = axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
del Temp, Temp0
# The y axis is shared, so limits and label only need to be set on the first panel.
_ = ax[0].set_ylim([0,35])
_ = ax[0].set_ylabel('Count')
st = fig.suptitle("Most appeared movie actors in Netflix library by country", fontsize="x-large")
st.set_y(1)
# Most frequent TV show actors for each of the five countries with the most TV shows.
Top = 10
tv_titles = Data.loc[Data.type == 'TV Show']

# The five countries that appear most often among TV shows.
Group = Col_Sep('country', tv_titles)
Group.columns = ['Country','Count']
Temp = Group[:5]['Country'].tolist()
del Group

# For every country, count cast members over the TV shows whose country field contains it.
# na=False treats titles without a country as non-matching.
# NOTE(review): this is a plain substring match, so a country whose name is contained in
# another country's name would also match that other country.
per_country = [
    Col_Sep('cast', df=tv_titles.loc[tv_titles['country'].str.contains(country, regex=False, na=False)])
    .head(Top)
    .assign(Country=country)
    for country in Temp
]
Group = pd.concat(per_country)
Group.columns = ['Actor', 'Count', 'Country']
del per_country, tv_titles

# squeeze=False always yields a 2-D axes array, so indexing also works for a single country.
fig, ax = plt.subplots(nrows=1, ncols=len(Temp), figsize=(16, 6), sharey = True, squeeze=False)
ax = ax[0]
C = ['Blues','OrRd','winter','spring','summer']
for axis, country, palette in zip(ax, Temp, C):
    Temp0 = Group.loc[Group['Country'] == country]
    _ = sns.barplot(ax=axis, x='Actor', y ='Count', data=Temp0, palette= palette, edgecolor='k', hatch="///")
    _ = axis.set_xlabel(country)
    _ = axis.set_ylabel(None)
    _ = axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
del Temp, Temp0
# The y axis is shared, so limits and label only need to be set on the first panel.
_ = ax[0].set_ylim([0,18])
_ = ax[0].set_ylabel('Count')
st = fig.suptitle("Most appeared TV show actors in Netflix library by country", fontsize="x-large")
st.set_y(1)